# Load the UCI electricity measurements from CSV into a data frame
data <- read.csv(file = "UCI-electricity/UCI_data.csv")
# Peek at the leading rows to confirm the columns parsed as expected
head(data, n = 6L)
## date T1 RH_1 T2 RH_2 T3 RH_3
## 1 2016-04-19 20:30:00 22.20000 39.50000 20.56667 37.65667 22.23000 37.03000
## 2 2016-03-05 04:40:00 20.35667 37.12667 17.56667 40.23000 20.89000 37.66333
## 3 2016-03-14 12:40:00 20.92667 38.79000 21.10000 35.52667 21.60000 36.29000
## 4 2016-01-22 15:30:00 18.29000 38.90000 17.29000 39.26000 18.39000 39.32667
## 5 2016-02-10 00:40:00 22.29000 42.33333 21.60000 40.43333 22.66667 43.36333
## 6 2016-02-16 15:40:00 21.79000 34.73000 19.76000 35.40000 21.23000 35.20000
## T4 RH_4 T5 RH_5 T6 RH_6 T7 RH_7 T8
## 1 22.31857 36.61000 20.63333 62.16667 8.823333 13.29667 21.00 29.31857 22.23
## 2 18.70000 36.26000 18.46333 43.56000 1.230000 78.96333 18.39 32.29000 21.00
## 3 21.00000 34.82667 18.10000 46.12667 6.033333 37.30000 21.29 27.56667 21.70
## 4 16.10000 38.79000 16.10000 47.70000 4.595000 94.94500 16.20 33.59000 17.78
## 5 19.10000 40.90000 19.29000 50.74500 2.530000 92.19000 20.00 40.00000 23.10
## 6 20.10000 34.76000 19.03333 38.40000 4.500000 46.70000 21.76 26.26333 22.60
## RH_8 T9 RH_9 T_out Press_mm_hg RH_out Windspeed Visibility
## 1 38.46333 19.92667 33.90 9.700000 766.1000 65.50000 3.500000 40.00000
## 2 43.40000 18.39000 41.09 0.300000 740.3333 99.00000 1.000000 41.33333
## 3 32.44750 19.39000 38.76 4.400000 768.4667 72.00000 6.000000 22.66667
## 4 41.66333 15.80000 39.20 3.350000 760.6000 82.00000 5.500000 41.00000
## 5 46.09000 18.89000 43.73 3.200000 738.9000 88.00000 7.333333 56.00000
## 6 32.66333 18.29000 40.20 4.633333 770.8667 52.33333 2.333333 40.00000
## Tdewpoint rv1 rv2 TARGET_energy
## 1 3.3500000 24.061869 24.061869 60
## 2 0.1000000 4.622052 4.622052 50
## 3 -0.2666667 5.635898 5.635898 80
## 4 0.5000000 49.216445 49.216445 40
## 5 1.4000000 47.617579 47.617579 60
## 6 -4.3333333 16.431218 16.431218 50
# Show the trailing rows as well, to check the file ends cleanly
tail(data, n = 6L)
## date T1 RH_1 T2 RH_2 T3 RH_3
## 19730 2016-02-09 18:40:00 22.39 41.40 21.96333 40.00000 22.39000 41.66333
## 19731 2016-01-27 06:30:00 20.10 44.90 19.39000 44.17250 19.70000 44.00000
## 19732 2016-04-14 08:50:00 21.39 41.09 19.23000 44.63333 22.79000 38.03000
## 19733 2016-04-25 08:30:00 20.89 36.50 18.00000 40.23000 22.13333 34.43333
## 19734 2016-03-04 06:50:00 20.79 36.70 18.79000 38.24500 21.00000 38.79000
## 19735 2016-02-16 10:20:00 19.79 39.20 18.23000 39.40000 21.55000 37.94000
## T4 RH_4 T5 RH_5 T6 RH_6 T7 RH_7
## 19730 19.39000 40.70000 19.50000 49.18778 3.723333 92.30000 19.87889 40.90444
## 19731 19.06667 46.46667 17.61111 56.43333 10.263333 85.73000 18.20000 47.30222
## 19732 22.53333 39.36000 20.39000 44.59000 7.900000 62.29667 20.50000 34.55846
## 19733 20.70000 34.61200 20.00000 42.56000 6.433333 53.86667 21.29000 31.70000
## 19734 19.29000 36.11750 18.17143 51.50000 4.051429 79.75286 18.70000 32.48571
## 19735 19.23000 35.73000 18.20000 42.29000 -0.600000 86.06000 18.29000 29.89000
## T8 RH_8 T9 RH_9 T_out Press_mm_hg RH_out
## 19730 23.39000 44.72000 19.0000 43.09000 4.033333 735.7667 93.0000
## 19731 17.79000 49.59000 17.2300 49.59000 10.300000 757.3500 80.5000
## 19732 22.10000 40.79000 21.9725 38.34750 4.116667 754.3000 100.0000
## 19733 23.39000 37.79000 20.0000 36.32667 4.000000 754.8500 90.0000
## 19734 20.89000 44.82143 18.5000 41.72571 3.807143 740.5738 91.2381
## 19735 20.13333 37.96667 17.7900 37.43333 0.500000 772.2333 95.0000
## Windspeed Visibility Tdewpoint rv1 rv2 TARGET_energy
## 19730 4.000000 40.0 2.9666667 19.390282 19.390282 140
## 19731 10.000000 34.5 7.0000000 2.233453 2.233453 40
## 19732 1.000000 33.0 4.1166667 18.511994 18.511994 100
## 19733 3.000000 25.5 2.4500000 6.176545 6.176545 70
## 19734 7.285714 58.5 2.4547619 4.780297 4.780297 50
## 19735 1.333333 62.0 -0.2666667 23.204486 23.204486 280
# Summarise column names, storage types and a few sample values
str(object = data)
## 'data.frame': 19735 obs. of 28 variables:
## $ date : chr "2016-04-19 20:30:00" "2016-03-05 04:40:00" "2016-03-14 12:40:00" "2016-01-22 15:30:00" ...
## $ T1 : num 22.2 20.4 20.9 18.3 22.3 ...
## $ RH_1 : num 39.5 37.1 38.8 38.9 42.3 ...
## $ T2 : num 20.6 17.6 21.1 17.3 21.6 ...
## $ RH_2 : num 37.7 40.2 35.5 39.3 40.4 ...
## $ T3 : num 22.2 20.9 21.6 18.4 22.7 ...
## $ RH_3 : num 37 37.7 36.3 39.3 43.4 ...
## $ T4 : num 22.3 18.7 21 16.1 19.1 ...
## $ RH_4 : num 36.6 36.3 34.8 38.8 40.9 ...
## $ T5 : num 20.6 18.5 18.1 16.1 19.3 ...
## $ RH_5 : num 62.2 43.6 46.1 47.7 50.7 ...
## $ T6 : num 8.82 1.23 6.03 4.59 2.53 ...
## $ RH_6 : num 13.3 79 37.3 94.9 92.2 ...
## $ T7 : num 21 18.4 21.3 16.2 20 ...
## $ RH_7 : num 29.3 32.3 27.6 33.6 40 ...
## $ T8 : num 22.2 21 21.7 17.8 23.1 ...
## $ RH_8 : num 38.5 43.4 32.4 41.7 46.1 ...
## $ T9 : num 19.9 18.4 19.4 15.8 18.9 ...
## $ RH_9 : num 33.9 41.1 38.8 39.2 43.7 ...
## $ T_out : num 9.7 0.3 4.4 3.35 3.2 ...
## $ Press_mm_hg : num 766 740 768 761 739 ...
## $ RH_out : num 65.5 99 72 82 88 ...
## $ Windspeed : num 3.5 1 6 5.5 7.33 ...
## $ Visibility : num 40 41.3 22.7 41 56 ...
## $ Tdewpoint : num 3.35 0.1 -0.267 0.5 1.4 ...
## $ rv1 : num 24.06 4.62 5.64 49.22 47.62 ...
## $ rv2 : num 24.06 4.62 5.64 49.22 47.62 ...
## $ TARGET_energy: int 60 50 80 40 60 50 60 400 80 100 ...
library(striprtf)
# Location of the RTF file that describes the data set
file_path <- "UCI-electricity/description.rtf"
# Extract the plain text from the RTF document
rtf_content <- striprtf::read_rtf(file = file_path)
# Print the extracted description to the console
cat(rtf_content)
## *| Attribute Information:date time year-month-day hour:minute:second T1, Temperature in kitchen area, in Celsius RH_1, Humidity in kitchen area, in % T2, Temperature in living room area, in Celsius RH_2, Humidity in living room area, in % T3, Temperature in laundry room area RH_3, Humidity in laundry room area, in % T4, Temperature in office room, in Celsius RH_4, Humidity in office room, in % T5, Temperature in bathroom, in Celsius RH_5, Humidity in bathroom, in % T6, Temperature outside the building (north side), in Celsius RH_6, Humidity outside the building (north side), in % T7, Temperature in ironing room , in Celsius RH_7, Humidity in ironing room, in % T8, Temperature in teenager room 2, in Celsius RH_8, Humidity in teenager room 2, in % T9, Temperature in parents room, in Celsius RH_9, Humidity in parents room, in % To, Temperature outside (from Chievres weather station), in Celsius Pressure (from Chievres weather station), in mm Hg RH_out, Humidity outside (from Chievres weather station), in % Wind speed (from Chievres weather station), in m/s Visibility (from Chievres weather station), in km Tdewpoint (from Chievres weather station), °C rv1, Random variable 1, nondimensional rv2, Random variable 2, nondimensional TARGET_Energy, energy use of Appliances and light fixtures in the house in Wh |
# Number of NA entries found in each column
colSums(x = is.na(data))
## date T1 RH_1 T2 RH_2
## 0 0 0 0 0
## T3 RH_3 T4 RH_4 T5
## 0 0 0 0 0
## RH_5 T6 RH_6 T7 RH_7
## 0 0 0 0 0
## T8 RH_8 T9 RH_9 T_out
## 0 0 0 0 0
## Press_mm_hg RH_out Windspeed Visibility Tdewpoint
## 0 0 0 0 0
## rv1 rv2 TARGET_energy
## 0 0 0
# Share of NA entries in each column, expressed as a percentage
100 * colMeans(x = is.na(data))
## date T1 RH_1 T2 RH_2
## 0 0 0 0 0
## T3 RH_3 T4 RH_4 T5
## 0 0 0 0 0
## RH_5 T6 RH_6 T7 RH_7
## 0 0 0 0 0
## T8 RH_8 T9 RH_9 T_out
## 0 0 0 0 0
## Press_mm_hg RH_out Windspeed Visibility Tdewpoint
## 0 0 0 0 0
## rv1 rv2 TARGET_energy
## 0 0 0
# Histograms of the sensor readings, one figure per variable group.
#
# Every group is drawn on a grid large enough to hold all of its panels, and
# the group title is written into the top outer margin (reserved through
# `oma`) so it sits above the panels instead of being drawn over them.

# Draw one histogram per column of `data` on a shared grid.
#   columns: names of the columns in `data` to plot
#   mains:   panel titles, parallel to `columns`
#   xlabs:   x-axis labels, parallel to `columns`
#   fills:   bar colours, parallel to `columns`
#   grid:    c(rows, cols) of the layout; needs a cell for every panel so
#            the whole group (and its title) stays on one page
#   title:   overall title written above the grid
# Graphics parameters are restored on exit so later plots are unaffected.
plot_histogram_grid <- function(columns, mains, xlabs, fills, grid, title) {
  stopifnot(
    length(mains) == length(columns),
    length(xlabs) == length(columns),
    length(fills) == length(columns),
    prod(grid) >= length(columns)
  )
  old_par <- par(mfrow = grid, mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
  on.exit(par(old_par), add = TRUE)
  for (i in seq_along(columns)) {
    hist(data[[columns[i]]], main = mains[i], xlab = xlabs[i], col = fills[i])
  }
  # line = 0 is the first line of the outer margin, i.e. above all panels
  mtext(title, outer = TRUE, line = 0, cex = 1.5)
  invisible(NULL)
}

# Panel titles and colours shared by the temperature and humidity figures,
# so the same room looks the same in both
room_titles <- c(
  "Kitchen", "Living Room", "Laundry Room", "Office Room", "Bathroom",
  "Outside North", "Ironing Room", "Teenager Room 2", "Parents Room",
  "Outside Chievres"
)
room_fills <- c(
  "blue", "green", "red", "purple", "orange",
  "yellow", "pink", "cyan", "brown", "grey"
)
temperature_cols <- c(paste0("T", 1:9), "T_out")
humidity_cols <- c(paste0("RH_", 1:9), "RH_out")

# Plot the histograms for temperature variables
plot_histogram_grid(
  columns = temperature_cols,
  mains = room_titles,
  xlabs = paste0(temperature_cols, " (Celsius)"),
  fills = room_fills,
  grid = c(2, 5),
  title = "Histograms of Temperature Variables"
)

# Plot the histograms for humidity variables
plot_histogram_grid(
  columns = humidity_cols,
  mains = room_titles,
  xlabs = paste0(humidity_cols, " (%)"),
  fills = room_fills,
  grid = c(2, 5),
  title = "Histograms of Humidity Variables"
)

# Plot the histograms for additional numerical variables.
# There are seven of them, so a 2 x 4 grid is used to keep them on one page.
plot_histogram_grid(
  columns = c(
    "Press_mm_hg", "Windspeed", "Visibility", "Tdewpoint",
    "rv1", "rv2", "TARGET_energy"
  ),
  mains = c(
    "Pressure (Chievres)", "Wind Speed (Chievres)", "Visibility (Chievres)",
    "Dewpoint (Chievres)", "Random Variable 1", "Random Variable 2",
    "Energy Use (Wh)"
  ),
  xlabs = c(
    "Press_mm_hg (mm Hg)", "Windspeed (m/s)", "Visibility (km)",
    "Tdewpoint (Celsius)", "rv1", "rv2", "TARGET_energy (Wh)"
  ),
  fills = c("blue", "green", "red", "purple", "orange", "yellow", "pink"),
  grid = c(2, 4),
  title = "Histograms of Additional Numerical Variables"
)
# Pairwise correlations between the numeric columns, using complete rows only
cor_matrix <- cor(Filter(is.numeric, data), use = "complete.obs")
# Install and load the corrplot package
#install.packages("corrplot")
library(corrplot)
## corrplot 0.92 loaded
# Draw the correlation matrix as circles, with small black labels tilted
# 45 degrees
corrplot(
  corr = cor_matrix,
  method = "circle",
  tl.cex = 0.5,
  tl.col = "black",
  tl.srt = 45
)
## Correlation Analysis
The correlation matrix plot above visualizes the relationships between the numerical variables in the dataset. Here are the key observations:
These observations provide insights into how different environmental variables interact within the house and with external conditions. The correlations between temperatures, humidity, and energy use can help in understanding the energy dynamics and potential factors influencing energy consumption in the house.
# Install necessary packages if not already installed
#install.packages("GGally")
#install.packages("dplyr")
# Load the packages
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Split the columns into three groups: temperatures, humidities, and the
# remaining numerical variables
temperature_vars <- data[, c(
  "T1", "T2", "T3", "T4", "T5", "T6", "T7", "T8", "T9", "T_out"
)]
humidity_vars <- data[, c(
  "RH_1", "RH_2", "RH_3", "RH_4", "RH_5",
  "RH_6", "RH_7", "RH_8", "RH_9", "RH_out"
)]
additional_vars <- data[, c(
  "Press_mm_hg", "Windspeed", "Visibility", "Tdewpoint",
  "rv1", "rv2", "TARGET_energy"
)]
# Scatter-plot matrix of the temperature columns
ggpairs(data = temperature_vars, title = "Pair Plot for Temperature Variables")
# Scatter-plot matrix of the humidity columns
ggpairs(data = humidity_vars, title = "Pair Plot for Humidity Variables")
# Scatter-plot matrix of the remaining numerical columns
ggpairs(
  data = additional_vars,
  title = "Pair Plot for Additional Numerical Variables"
)
# Box plots for the temperature and humidity columns. Both figures use a
# 2x5 grid with the same panel titles and colours, so the panels are drawn
# in a loop; local() keeps the helper vectors out of the global workspace.
local({
  panel_titles <- c(
    "Kitchen", "Living Room", "Laundry Room", "Office Room", "Bathroom",
    "Outside North", "Ironing Room", "Teenager Room 2", "Parents Room",
    "Outside (Chievres)"
  )
  panel_fills <- c(
    "blue", "green", "red", "purple", "orange",
    "yellow", "pink", "cyan", "brown", "grey"
  )
  column_groups <- list(
    c("T1", "T2", "T3", "T4", "T5", "T6", "T7", "T8", "T9", "T_out"),
    c("RH_1", "RH_2", "RH_3", "RH_4", "RH_5",
      "RH_6", "RH_7", "RH_8", "RH_9", "RH_out")
  )
  for (group in column_groups) {
    par(mfrow = c(2, 5)) # Arrange plots in a 2x5 grid
    for (k in seq_along(group)) {
      boxplot(data[[group[k]]], main = panel_titles[k], col = panel_fills[k])
    }
  }
})
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("tidyr")
# Load the packages
library(ggplot2)
library(dplyr)
library(tidyr)
# Convert the 'date' column to datetime format.
# The time zone is pinned to UTC. Without an explicit `tz` the strings are
# interpreted in the session's local zone, so the parsed values differ
# between machines, and wall-clock times that fall into a local
# daylight-saving gap may not convert reliably.
# NOTE(review): the source file does not state which zone the timestamps were
# recorded in; UTC is used here only to make parsing deterministic - confirm.
data$date <- as.POSIXct(data$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Reshape the temperature columns to long form: one row per (date, variable)
temperature_vars <- pivot_longer(
  select(data, date, T1, T2, T3, T4, T5, T6, T7, T8, T9, T_out),
  cols = -date,
  names_to = "variable",
  values_to = "value"
)
# Line chart of every temperature column over time, one colour per column
ggplot(data = temperature_vars, mapping = aes(x = date, y = value, color = variable)) +
  geom_line() +
  scale_color_manual(
    values = c(
      "blue", "green", "red", "purple", "orange",
      "yellow", "pink", "cyan", "brown", "grey"
    )
  ) +
  labs(title = "Temperature Time Series", y = "Temperature (Celsius)", x = "Date") +
  # theme_minimal() replaces the whole theme, so the legend tweak comes after
  theme_minimal() +
  theme(legend.position = "bottom")
# Reshape the humidity columns to long form: one row per (date, variable)
humidity_vars <- pivot_longer(
  select(data, date, RH_1, RH_2, RH_3, RH_4, RH_5, RH_6, RH_7, RH_8, RH_9, RH_out),
  cols = -date,
  names_to = "variable",
  values_to = "value"
)
# Line chart of every humidity column over time, one colour per column
ggplot(data = humidity_vars, mapping = aes(x = date, y = value, color = variable)) +
  geom_line() +
  scale_color_manual(
    values = c(
      "blue", "green", "red", "purple", "orange",
      "yellow", "pink", "cyan", "brown", "grey"
    )
  ) +
  labs(title = "Humidity Time Series", y = "Humidity (%)", x = "Date") +
  # theme_minimal() replaces the whole theme, so the legend tweak comes after
  theme_minimal() +
  theme(legend.position = "bottom")
# Line chart of the energy-use column against the timestamp
ggplot(data = data, mapping = aes(x = date, y = TARGET_energy)) +
  geom_line(color = "blue") +
  labs(
    title = "Energy Use Time Series",
    y = "Energy Use (Wh)",
    x = "Date"
  ) +
  theme_minimal()
### 1 Humidity Time Series
The humidity time series plot above visualizes the variations in humidity levels across different rooms and the outside environment over time. Here are the key observations:
The temperature time series plot above visualizes the variations in temperature levels across different rooms and the outside environment over time. Here are the key observations:
The energy use time series plot above visualizes the variations in energy consumption over time. Here are the key observations:
The time series plots provide valuable insights into the behavior of humidity, temperature, and energy use in the building. Understanding these patterns is crucial for developing predictive models and implementing energy-saving measures.
Further analysis could include:
By leveraging these insights, we can develop robust models to predict energy use and optimize energy consumption in low-energy buildings.
#install.packages("forecast")
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Decompose energy use time series.
# ts() treats its input as equally spaced observations in the order given,
# but the rows of `data` are not stored chronologically (see the head() and
# tail() output), so they are put in time order first.
# The timestamps fall on 10-minute marks, so one daily cycle spans
# 6 readings per hour x 24 hours = 144 observations.
# NOTE(review): assumes the sorted series has no gaps or duplicate
# timestamps - confirm before relying on the seasonal component.
readings_per_day <- 6 * 24
energy_ts <- ts(
  data$TARGET_energy[order(data$date)],
  frequency = readings_per_day
)
# s.window = "periodic" forces the same seasonal pattern for every cycle
decomposed <- stl(energy_ts, s.window = "periodic")
plot(decomposed)
### Critical Analysis
The time series decomposition plot above visualizes the components of the energy use data over time. Here are the key observations:
The decomposition plot provides valuable insights into the underlying patterns in the energy use data:

- The strong seasonal component suggests regular cycles, which could be leveraged for predictive modeling.
- The increasing trend indicates a long-term rise in energy consumption, which might require addressing underlying causes.
- The high variability in the remainder component suggests the presence of additional factors affecting energy use, warranting further investigation.
# Lagged correlation
library(dplyr)
# lag() shifts by rows, so the rows must be in time order for a lag to mean
# "earlier in time"; the data frame is not stored chronologically.
# The timestamps fall on 10-minute marks, so one hour is 6 rows and
# 24 hours is 144 rows.
# NOTE(review): assumes the sorted series has no gaps or duplicate
# timestamps - confirm.
rows_per_hour <- 6L
lagged_data <- data %>%
  arrange(date) %>%
  mutate(
    # energy use one hour earlier
    TARGET_energy_lag1 = dplyr::lag(TARGET_energy, 1L * rows_per_hour),
    # energy use at the same time on the previous day
    TARGET_energy_lag24 = dplyr::lag(TARGET_energy, 24L * rows_per_hour)
  )
# Rows without a lagged value (the start of the series) are dropped
cor(
  lagged_data[, c("TARGET_energy", "TARGET_energy_lag1", "TARGET_energy_lag24")],
  use = "complete.obs"
)
## TARGET_energy TARGET_energy_lag1 TARGET_energy_lag24
## TARGET_energy 1.000000000 0.005404513 -0.012206256
## TARGET_energy_lag1 0.005404513 1.000000000 0.007792859
## TARGET_energy_lag24 -0.012206256 0.007792859 1.000000000
library(ggplot2)
#install.packages("reshape2")
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Correlation heatmap: recompute the correlations of the numeric columns,
# reshape the matrix to one row per (Var1, Var2) pair, and draw it as tiles
cor_matrix <- cor(Filter(is.numeric, data), use = "complete.obs")
melted_cor_matrix <- reshape2::melt(cor_matrix)
ggplot(
  data = melted_cor_matrix,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  # diverging palette centred on zero correlation
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  labs(title = "Correlation Heatmap") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Critical Analysis
The correlation heatmap above visualizes the relationships between various numerical variables in the dataset. Here are the key observations:
The table below shows the correlation between energy use and its lagged values:
| | TARGET_energy | TARGET_energy_lag1 | TARGET_energy_lag24 |
|---|---|---|---|
| TARGET_energy | 1.000000000 | 0.005404513 | -0.012206256 |
| TARGET_energy_lag1 | 0.005404513 | 1.000000000 | 0.007792859 |
| TARGET_energy_lag24 | -0.012206256 | 0.007792859 | 1.000000000 |
Observations:

- Weak Correlations with Lagged Values:
  - The correlation between current energy use and energy use lagged by 1 hour (TARGET_energy_lag1) is very weak (0.005).
  - The correlation between current energy use and energy use lagged by 24 hours (TARGET_energy_lag24) is also very weak (-0.012).
The correlation heatmap and lagged correlation analysis provide valuable insights into the relationships between different variables in the dataset:

- The strong correlations among temperature variables and among humidity variables indicate consistent environmental conditions within the house.
- The weak correlations between energy use and its lagged values suggest that current energy consumption is not strongly dependent on past values, indicating the influence of other factors.
- The negative correlations between temperature and humidity highlight the interplay between these two environmental factors.